{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0e24b017",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "52db814ebf4b475bb905233f08ba3fee",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'/home/mesolitica/stt'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from huggingface_hub import snapshot_download\n",
    "\n",
    "# Download only the files matching without-audio/*.parquet from the dataset\n",
    "# repo into the current directory; the call's return value is the cell output.\n",
    "snapshot_download(\n",
    "    repo_id='mesolitica/Malaysian-Speech-Instructions',\n",
    "    repo_type='dataset',\n",
    "    allow_patterns='without-audio/*.parquet',\n",
    "    local_dir='./',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8781c464",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from huggingface_hub import snapshot_download\n",
    "\n",
    "# snapshot_download(\n",
    "#     repo_type='dataset',\n",
    "#     repo_id=\"mesolitica/Malaysian-STT-Whisper-Stage2\", \n",
    "#     allow_patterns='prepared-mixed-malaysian-instructions*.zip', local_dir = './')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1fdc63ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "from multiprocess import Pool\n",
    "import itertools\n",
    "import zipfile\n",
    "import os\n",
    "\n",
    "def chunks(l, n):\n",
    "    \"\"\"Yield ``(piece, index)`` pairs, where each piece is a consecutive slice\n",
    "    of ``l`` holding at most ``n`` items and ``index`` is its 0-based position.\"\"\"\n",
    "    for index, start in enumerate(range(0, len(l), n)):\n",
    "        yield l[start:start + n], index\n",
    "\n",
    "\n",
    "def multiprocessing(strings, function, cores=6, returned=True):\n",
    "    \"\"\"Split ``strings`` into chunks and run ``function`` on them in a process pool.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    strings : sequence\n",
    "        Items to process; must support ``len`` and slicing.\n",
    "    function : callable\n",
    "        Called once per chunk with a ``(chunk, chunk_index)`` tuple. When\n",
    "        ``returned`` is true it must return an iterable.\n",
    "    cores : int\n",
    "        Number of worker processes; also used to derive the chunk size.\n",
    "    returned : bool\n",
    "        When true, the per-chunk results are concatenated in chunk order and\n",
    "        returned as one list; otherwise nothing is returned.\n",
    "    \"\"\"\n",
    "    if not len(strings):\n",
    "        # Nothing to process, so do not start a pool at all.\n",
    "        return [] if returned else None\n",
    "\n",
    "    # Floor division gives 0 when there are fewer items than cores, and range()\n",
    "    # rejects a zero step, so never go below one item per chunk.\n",
    "    chunk_size = max(1, len(strings) // cores)\n",
    "    df_split = chunks(strings, chunk_size)\n",
    "\n",
    "    pool = Pool(cores)\n",
    "    try:\n",
    "        pooled = pool.map(function, df_split)\n",
    "    finally:\n",
    "        # Always release the worker processes, even if a chunk raised.\n",
    "        pool.close()\n",
    "        pool.join()\n",
    "\n",
    "    if returned:\n",
    "        return list(itertools.chain.from_iterable(pooled))\n",
    "\n",
    "def loop(files):\n",
    "    \"\"\"Unpack every zip archive of a ``(paths, chunk_index)`` chunk into the\n",
    "    current directory, removing each archive once it has been extracted.\"\"\"\n",
    "    zip_paths, _ = files\n",
    "    destination_folder = './'\n",
    "    for zip_file_path in tqdm(zip_paths):\n",
    "        with zipfile.ZipFile(zip_file_path, 'r') as archive:\n",
    "            archive.extractall(destination_folder)\n",
    "        # Only reached when extraction succeeded; a failed archive is kept.\n",
    "        os.remove(zip_file_path)\n",
    "\n",
    "# files = glob('*.zip')\n",
    "# if len(files):\n",
    "#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4274a22f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.4\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoProcessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9630b07a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Qwen2-Audio processor; its chat template is what loop() uses to\n",
    "# render conversations as text.\n",
    "processor = AutoProcessor.from_pretrained(\"Qwen/Qwen2-Audio-7B-Instruct\")\n",
    "tokenizer = processor.tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ef2fbe87",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n{% endif %}<|im_start|>{{ message['role'] }}\\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\\n{% endif %}\""
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the processor's chat template to see how audio turns are rendered.\n",
    "processor.chat_template"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "47d72b4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "def loop(rows):\n",
    "    \"\"\"Render each dataset row into a chat-formatted sample.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    rows : tuple\n",
    "        ``(records, chunk_index)``; each record is a dict with a JSON-encoded\n",
    "        ``'prompt'`` conversation and an ``'audio_filename'`` path.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One ``{'text': ..., 'audio': ...}`` entry per usable row. A row is\n",
    "        skipped when its prompt cannot be parsed or rendered, when the rendered\n",
    "        text has no ``<|AUDIO|>`` placeholder, or when its audio file is missing.\n",
    "    \"\"\"\n",
    "    rows, _ = rows\n",
    "    data = []\n",
    "    for r in tqdm(rows):\n",
    "        try:\n",
    "            conversation = json.loads(r['prompt'])\n",
    "            messages = []\n",
    "            for c in conversation:\n",
    "                # Keep only the leading turns; stop at the first one without content.\n",
    "                if c['content'] is None:\n",
    "                    break\n",
    "                messages.append(c)\n",
    "            # Drop a trailing user turn so the sample does not end on a user message.\n",
    "            if messages and messages[-1]['role'] == 'user':\n",
    "                messages = messages[:-1]\n",
    "            if not messages:\n",
    "                continue\n",
    "            text = processor.apply_chat_template(messages, tokenize=False)\n",
    "            if '<|AUDIO|>' not in text:\n",
    "                continue\n",
    "        except Exception:\n",
    "            # Deliberately best-effort: a malformed row is skipped rather than\n",
    "            # aborting the whole chunk.\n",
    "            continue\n",
    "\n",
    "        f = r.get('audio_filename')\n",
    "        # A missing value (None/NaN) would make os.path.exists raise TypeError and\n",
    "        # kill the worker, so treat it like a missing file.\n",
    "        if not isinstance(f, (str, bytes, os.PathLike)) or not os.path.exists(f):\n",
    "            continue\n",
    "\n",
    "        data.append({\n",
    "            'text': text,\n",
    "            'audio': f,\n",
    "        })\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0fde8ec2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "722004"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = []\n",
    "# glob() returns paths in filesystem-dependent order; sort them so the row\n",
    "# order (and everything derived from it) is the same on every run.\n",
    "for f in sorted(glob('without-audio/*.parquet')):\n",
    "    data.extend(pd.read_parquet(f).to_dict(orient = 'records'))\n",
    "\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "eab3a846",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1201.53it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Smoke-test loop() in-process on the last 10 rows before running it in the pool.\n",
    "processed = loop((data[-10:], 0))\n",
    "len(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "09dabc7a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5510.31it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4678.51it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5333.67it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4878.42it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4910.16it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5284.19it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4864.85it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4920.43it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4584.16it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5163.91it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5518.83it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5523.32it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5547.65it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5508.57it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 3632.22it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5481.93it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:06<00:00, 5483.93it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 5102.66it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 4742.57it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 5033.07it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████| 36100/36100 [00:07<00:00, 5052.14it/s]\n"
     ]
    }
   ],
   "source": [
    "# Run loop() over the whole dataset using 20 worker processes.\n",
    "processed = multiprocessing(data, loop, cores = 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f2a6a9e0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "720969"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows kept after loop() filtered out unusable ones.\n",
    "len(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "81c5879a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': '<|im_start|>system\\nAct as a voice assistant chatbot. Keep every response under 300 characters. Be accurate, brief, and easy to understand when spoken aloud. Don’t overexplain or repeat. Ask for clarification only when needed. Prioritize clarity and brevity at all times.<|im_end|>\\n<|im_start|>user\\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\\n<|im_end|>\\n<|im_start|>assistant\\nD. menyeberangi<|im_end|>\\n',\n",
       " 'audio': 'mallm-v3/6562.mp3'}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check one rendered sample.\n",
    "processed[-2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2e7117ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the rendered samples as a single JSON array.\n",
    "with open('prepare-Malaysian-Speech-Instructions.json', 'w') as fopen:\n",
    "    json.dump(processed, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
